Load required packages and read in combined data.
#packages
pacman::p_load(dplyr,
tidyr,
ggplot2,
rjson,
rdatacite,
cowplot,
stringr,
knitr,
DT)
# Restore the objects saved from 3_Combined_data.R (provides `combined_dois`)
load(file = "data_rdata_files/Combined_ALL_data.Rdata")

# Keep only records with a publication year of 2012 or later
all_dois <- filter(combined_dois, publicationYear >= 2012)
Look at DOIs by their origin
# Number of DOIs per source group, rendered as a markdown table
kable(
  tally(group_by(all_dois, group), name = "count")
)
| group | count |
|---|---|
| Affiliation - CrossRef | 147702 |
| Affiliation - Datacite | 51053 |
| IR_publisher | 24104 |
General data cleaning
# DRUM is inconsistently specified (with and without the "(DRUM)" suffix);
# normalise every variant to the full name. grepl() returns FALSE for NA
# publishers, so the logical index never contains NA.
all_dois$publisher[grepl("Data Repository for the University of Minnesota",
                         all_dois$publisher)] <-
  "Data Repository for the University of Minnesota (DRUM)"

# Remove MorphoSource data, as affiliation isn't included.
# A logical mask replaces `-which(...)`: when no row matches, `-which(...)`
# is `integer(0)` and the subset would silently return ZERO rows instead of
# all of them. `%in%` never yields NA, so rows with a missing publisher_plus
# are kept, exactly as before.
all_dois2 <- all_dois[!(all_dois$publisher_plus %in% "Duke-MorphoSource Media"), ]

# Make sure "dataset" is capitalised in all metadata resource types.
# Assigning through a logical index on the column is a harmless no-op when
# nothing matches (the row-subset `$<-` form errors on a zero-row selection).
all_dois2$resourceTypeGeneral[all_dois2$resourceTypeGeneral %in% "dataset"] <- "Dataset"
Look at all the Institutional Repositories Captured
# DOI counts for each publisher_plus value within the "IR_publisher" group
IR_pubs <- tally(
  group_by(filter(all_dois2, group == "IR_publisher"), publisher_plus),
  name = "count"
)

# Show the counts with reader-friendly column headers
kable(IR_pubs, col.names = c("Institutional Repository", "Count"))
| Institutional Repository | Count |
|---|---|
| Cornell | 4758 |
| Duke-Duke Digital Repository | 76 |
| Duke-Research Data Repository, Duke University | 147 |
| Michigan | 10 |
| Michigan-Deep Blue | 637 |
| Michigan-ICPSR/ISR | 109 |
| Michigan-Other | 57 |
| Minnesota | 692 |
| Virginia Tech | 333 |
| Washington U | 4085 |
Replace all of these publishers with “Institutional Repository” so that they will be represented in a single bar.
# Flag the ICPSR rows first, while `publisher` still holds the original names.
# The flag is computed on all_dois2 itself: row positions taken from all_dois
# do not line up with all_dois2 (rows were removed when all_dois2 was built),
# so indexing all_dois2 with them relabels the wrong records.
is_icpsr <- grepl("Consortium for Political", all_dois2$publisher)

# Every publisher_plus value found in the IR_publisher group is collapsed into
# one "Institutional Repository" category so it is drawn as a single bar.
all_dois2$publisher[all_dois2$publisher_plus %in% unique(IR_pubs$publisher_plus)] <-
  "Institutional Repository"

# Catch the remaining Cornell, Virginia Tech and DRUM records whose publisher
# name identifies the institutional repository. `%in%` treats NA as
# "no match", like the original which(x == ...) calls.
stray_ir_names <- c(
  "Cornell University Library",
  "University Libraries, Virginia Tech",
  "Data Repository for the University of Minnesota (DRUM)"
)
all_dois2$publisher[all_dois2$publisher %in% stray_ir_names] <- "Institutional Repository"

# ICPSR is also named inconsistently. Applied last, matching the original
# statement order, so it takes precedence over the relabelling above.
all_dois2$publisher[is_icpsr] <- "ICPSR"

# Drop the temporaries so the workspace holds the same objects as before
rm(is_icpsr, stray_ir_names)
Counts by publisher
# DOI counts for every institution/publisher pair, largest first within
# each institution
by_publisher <- all_dois2 %>%
  group_by(institution, publisher) %>%
  tally(name = "count") %>%
  arrange(institution, desc(count))

# Interactive table of the counts
datatable(by_publisher)
Counts by resource type
# DOI counts for every institution/resource-type pair, largest first within
# each institution
by_resource <- all_dois2 %>%
  group_by(institution, resourceTypeGeneral) %>%
  tally(name = "count") %>%
  arrange(institution, desc(count))
Create a table of top resources
# One row per resource type, one column per institution (missing
# combinations filled with 0), plus a row total used for sorting.
# (An optional restriction to Dataset/Software/Text/Image is left disabled.)
by_resource_table <- by_resource %>%
  # filter(resourceTypeGeneral %in% c("Dataset", "Software", "Text", "Image")) %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  # Sum across the institution columns, from Cornell through Washington U
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table of the wide summary
datatable(by_resource_table)
Write out the resources
# Save the resource-type summary without row names
write.csv(
  by_resource_table,
  file = "data_summary_data/Counts of Resource Types by Insitution.csv",
  row.names = FALSE
)
Subset to only datasets
# Records whose general resource type is "Dataset"
data_dois <- filter(all_dois2, resourceTypeGeneral == "Dataset")
Data DOIs by publisher
# Dataset DOI counts for every publisher/institution pair
by_publisher_data <- data_dois %>%
  group_by(publisher, institution) %>%
  tally(name = "count") %>%
  arrange(institution, desc(count))

# Wide layout: one row per publisher, one column per institution (0 where a
# pair is absent), with a row total used to sort publishers
by_publisher_data_table <- by_publisher_data %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table of the wide summary
datatable(by_publisher_data_table)
Write out the table of data publishers
# Save the dataset-publisher summary without row names
write.csv(
  by_publisher_data_table,
  file = "data_summary_data/Counts of Data Publishers By Insitituion.csv",
  row.names = FALSE
)
Subset to only software (only datacite has software)
# Records whose general resource type is "Software"
software_dois <- filter(all_dois2, resourceTypeGeneral == "Software")

# Software DOI counts for every publisher/institution pair
by_publisher_software <- software_dois %>%
  group_by(publisher, institution) %>%
  tally(name = "count") %>%
  arrange(institution, desc(count))

# Wide layout: one row per publisher, one column per institution (0 where a
# pair is absent), with a row total used to sort publishers
by_publisher_software_table <- by_publisher_software %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table of the wide summary
datatable(by_publisher_software_table)
Write out the table of software publishers
# Save the software-publisher summary without row names
write.csv(
  by_publisher_software_table,
  file = "data_summary_data/Counts of Software Publishers By Insitituion.csv",
  row.names = FALSE
)
Plot publishers by rank, ordered from most DOIs to least (take top 20)
# Bar chart of total dataset DOIs per publisher against publisher rank
# (rank 1 = most DOIs); the view is restricted to the first 20 ranks.
by_publisher_data %>%
  group_by(publisher) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count)) %>%
  # Rows are already sorted by descending count, so the row position IS the
  # rank. order() returns a sorting permutation rather than ranks and only
  # matched the rank here because of the preceding arrange().
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 20), n.breaks = 20) +
  labs(x = "Publisher Rank", y = "Number of DOIs", title = "Number of DOIs by top Publishers") +
  coord_cartesian(xlim = c(1, 20)) +
  theme_bw()
If we look at the top 9 publishers for the combined data (both DataCite and CrossRef), how many DOIs does this cover?
# The nine publishers with the largest row totals (table is sorted by Total)
top9pubs <- by_publisher_data_table$publisher[1:9]

# Compare DOI volume and publisher count inside vs. outside the top nine
by_publisher_data %>%
  group_by(publisher) %>%
  summarise(count = sum(count)) %>%
  mutate(intop9pub = publisher %in% top9pubs) %>%
  group_by(intop9pub) %>%
  summarise(
    totalDOIs = sum(count),
    nrepos = n()
  ) %>%
  ungroup() %>%
  # Share of all DOIs held by each of the two groups
  mutate(propDOIs = totalDOIs / sum(totalDOIs)) %>%
  kable(
    col.names = c(
      "In Top 9 Publishers",
      "Total N DOIs",
      "Total N Publishers",
      "Proportion of Total DOIs"
    )
  )
| In Top 9 Publishers | Total N DOIs | Total N Publishers | Proportion of Total DOIs |
|---|---|---|---|
| FALSE | 2274 | 159 | 0.0128642 |
| TRUE | 174496 | 9 | 0.9871358 |
Plot the number of DOIs from the top 9 publishers by institution
# Fixed fill colour for each of the top nine publishers
top9colors <- c("Harvard Dataverse" = "dodgerblue2",
                "Zenodo" = "darkorange1",
                "ICPSR" = "darkcyan",
                "Dryad" = "lightgray",
                "Qualitative Data Repository" = "gold1",
                "figshare" = "purple",
                "ENCODE Data Coordination Center" = "red",
                "Faculty Opinions Ltd" = "darkgreen",
                "Institutional Repository" = "lightblue")

# Dodged bar chart of dataset DOI counts per institution, one bar per top-nine
# publisher. The outer parentheses print the plot as well as assigning it.
(by_publisher_data_plot <- by_publisher_data %>%
  filter(publisher %in% top9pubs) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top9colors, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  scale_y_continuous(breaks = seq(from = 0, to = 5000, by = 500)) +
  # Zoom the y-axis to 0-5000 without discarding data; taller bars are clipped
  coord_cartesian(ylim = c(0, 5000)) +
  # Caption fixed: the original read "ENCODEbar" (missing space)
  labs(x = "Institution", y = "Count of Data DOIs - CrossRef & DataCite", caption = "Note: Michigan Dataverse & ENCODE bar cutoff for scaling") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5))

# Write the figure to disk as an 8 x 6 inch PNG
ggsave(by_publisher_data_plot, filename = "figures/Counts of Data DOIs by Institution - CrossRef and DataCite.png", device = "png", width = 8, height = 6, units = "in")
Look at the top software publishers (This excludes CrossRef affiliation data, as software is not a resource type).
# The six software publishers with the largest row totals
top6pubs_soft <- by_publisher_software_table$publisher[1:6]

# Fixed fill colour for each of those publishers
top6colors_soft <- c(
  "Zenodo" = "darkorange1",
  "Code Ocean" = "darkblue",
  "Institutional Repository" = "lightblue",
  "Optica Publishing Group" = "red",
  "CoMSES Net" = "pink",
  "figshare" = "purple"
)

# Dodged bar chart of software DOI counts per institution and publisher
by_publisher_software_plot <- ggplot(
  data = filter(by_publisher_software, publisher %in% top6pubs_soft),
  mapping = aes(x = institution, y = count, fill = publisher)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top6colors_soft, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  labs(x = "Institution", y = "Count of Software DOIs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot
by_publisher_software_plot

# Write the figure to disk as an 8 x 6 inch PNG
ggsave(
  filename = "figures/Counts of Software DOIs by Institution.png",
  plot = by_publisher_software_plot,
  device = "png",
  width = 8,
  height = 6,
  units = "in"
)
Subset to remove the CrossRef affiliation data from data DOIs
# Dataset DOI counts per publisher/institution, leaving out the records found
# through the CrossRef affiliation search
by_publisher_data_dc <- data_dois %>%
  filter(group != "Affiliation - CrossRef") %>%
  group_by(publisher, institution) %>%
  tally(name = "count") %>%
  arrange(institution, desc(count))

# Wide layout: one row per publisher, one column per institution (0 where a
# pair is absent), with a row total used to sort publishers
by_publisher_data_dc_table <- by_publisher_data_dc %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table of the wide summary
datatable(by_publisher_data_dc_table)
Look at publishers by rank of data DOIs
# Publisher plot: total dataset DOIs (CrossRef affiliation records excluded)
# per publisher against publisher rank; view restricted to the first 20 ranks.
by_publisher_data_dc %>%
  group_by(publisher) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count)) %>%
  # Rows are already sorted by descending count, so the row position IS the
  # rank. order() returns a sorting permutation rather than ranks and only
  # matched the rank here because of the preceding arrange().
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 20), n.breaks = 20) +
  coord_cartesian(xlim = c(1, 20)) +
  labs(x = "Publisher Rank", y = "Number of DOIs", title = "Number of DOIs by top Publishers") +
  theme_bw()
Look at the top 7 publishers here. How many DOIs does this capture?
# The seven publishers with the largest row totals (table is sorted by Total)
top7pubs <- by_publisher_data_dc_table$publisher[1:7]

# Compare DOI volume and publisher count inside vs. outside the top seven
by_publisher_data_dc %>%
  group_by(publisher) %>%
  summarise(count = sum(count)) %>%
  mutate(intop7pub = publisher %in% top7pubs) %>%
  group_by(intop7pub) %>%
  summarise(
    totalDOIs = sum(count),
    nrepos = n()
  ) %>%
  ungroup() %>%
  # Share of all DOIs held by each of the two groups
  mutate(propDOIs = totalDOIs / sum(totalDOIs))
## # A tibble: 2 × 4
## intop7pub totalDOIs nrepos propDOIs
## <lgl> <int> <int> <dbl>
## 1 FALSE 2168 153 0.0746
## 2 TRUE 26900 7 0.925
# Fixed fill colour for each of the top seven publishers
top7colors <- c(
  "Harvard Dataverse" = "dodgerblue2",
  "Zenodo" = "darkorange1",
  "ICPSR" = "darkcyan",
  "Dryad" = "lightgray",
  "Qualitative Data Repository" = "gold1",
  "figshare" = "purple",
  "Institutional Repository" = "lightblue"
)

# Dodged bar chart of dataset DOI counts per institution and publisher.
# Note: this reuses (overwrites) the name by_publisher_data_plot.
by_publisher_data_plot <- ggplot(
  data = filter(by_publisher_data_dc, publisher %in% top7pubs),
  mapping = aes(x = institution, y = count, fill = publisher)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top7colors, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  scale_y_continuous(breaks = seq(from = 0, to = 5000, by = 500)) +
  # Zoom the y-axis to 0-5000 without discarding data; taller bars are clipped
  coord_cartesian(ylim = c(0, 5000)) +
  labs(
    x = "Institution",
    y = "Count of Data DOIs",
    caption = "Note: Michigan Dataverse bar cutoff for scaling"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot
by_publisher_data_plot

# Write the figure to disk as an 8 x 6 inch PNG
ggsave(
  filename = "figures/Counts of DataCite Data DOIs by Institution.png",
  plot = by_publisher_data_plot,
  device = "png",
  width = 8,
  height = 6,
  units = "in"
)
Look at DOIs that came from the CrossRef search by affiliation.
# Dataset DOIs found through the CrossRef affiliation search, counted per
# publisher and institution, then spread to one column per institution with a
# row total used for sorting
by_crossref_publisher_data <- data_dois %>%
  filter(group == "Affiliation - CrossRef") %>%
  group_by(publisher, institution) %>%
  tally(name = "count") %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table of the wide summary
datatable(by_crossref_publisher_data)
# Dodged bar chart of CrossRef-affiliation dataset DOI counts per institution
# and publisher (default fill colours; no manual palette is applied)
by_crossref_publisher_data_plot <- data_dois %>%
  filter(group == "Affiliation - CrossRef") %>%
  group_by(publisher, institution) %>%
  tally(name = "count") %>%
  ggplot(mapping = aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  guides(fill = guide_legend(title.position = "top")) +
  scale_y_continuous(breaks = seq(from = 0, to = 5000, by = 500)) +
  # Zoom the y-axis to 0-5000 without discarding data; taller bars are clipped
  coord_cartesian(ylim = c(0, 5000)) +
  labs(
    x = "Institution",
    y = "Count of Data DOIs",
    caption = "Note: Michigan ENCODE bar cutoff for scaling"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot
by_crossref_publisher_data_plot

# Write the figure to disk as a 10 x 6 inch PNG
ggsave(
  filename = "figures/Counts of CrossRef Data DOIs by Institution.png",
  plot = by_crossref_publisher_data_plot,
  device = "png",
  width = 10,
  height = 6,
  units = "in"
)
Some repositories (such as Harvard’s Dataverse and Qualitative Data Repository) assign DOIs at the level of the file, rather than the study. Similarly, Zenodo often has many related DOIs for multiple figures within a study. In order to attempt to compare study-to-study counts of data sharing, look at the DOIs collapsed by “container”.
# For records that have a container identifier, count the DOIs sharing each
# container/publisher/title/institution combination, largest first
by_container <- all_dois2 %>%
  filter(!is.na(container_identifier)) %>%
  group_by(container_identifier, publisher, title, institution) %>%
  tally(name = "count") %>%
  arrange(desc(count))
How many publishers have container DOIs?
# Number of by_container rows per publisher, shown largest first.
# group_by() replaces whatever grouping by_container carries.
datatable(
  arrange(
    tally(group_by(by_container, publisher), name = "count"),
    desc(count)
  )
)
Collapsing by container for counts
# Row positions of every record that repeats an already-seen, non-missing
# container identifier (the first record of each container is not flagged)
containerdups <- which(!is.na(all_dois2$container_identifier) & duplicated(all_dois2$container_identifier))

# Drop the flagged rows with a logical mask. `all_dois2[-containerdups, ]`
# would return ZERO rows whenever containerdups is empty, because negating
# integer(0) selects nothing rather than everything.
# NOTE(review): duplicates are detected on container_identifier alone, so a
# container shared by two institutions keeps only its first record - confirm
# this is intended.
all_dois_collapsed <- all_dois2[!(seq_len(nrow(all_dois2)) %in% containerdups), ]

# Dataset records remaining after the collapse
data_dois_collapse <- all_dois_collapsed %>%
  filter(resourceTypeGeneral == "Dataset")

# Collapsed dataset DOI counts per publisher/institution
by_publisher_data_collapse <- data_dois_collapse %>%
  group_by(publisher, institution) %>%
  summarize(count = n()) %>%
  arrange(institution, desc(count))
Table of publisher counts
# Wide layout of the collapsed counts: one row per publisher, one column per
# institution (0 where a pair is absent), with a row total used for sorting
by_publisher_data_collapse_table <- by_publisher_data_collapse %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table of the wide summary
datatable(by_publisher_data_collapse_table)
Write out the table of data publishers
# Save the container-collapsed publisher summary without row names
write.csv(
  by_publisher_data_collapse_table,
  file = "data_summary_data/Counts of Data Publishers By Insitituion - Collapsed by container.csv",
  row.names = FALSE
)
For these graphs, we exclude the CrossRef affiliation data, which removes ENCODE and Faculty Opinions Ltd from the visualization.
# Collapsed dataset DOI counts per publisher/institution, leaving out the
# records found through the CrossRef affiliation search
by_publisher_data_dc_collapse <- data_dois_collapse %>%
  filter(group != "Affiliation - CrossRef") %>%
  group_by(publisher, institution) %>%
  tally(name = "count") %>%
  arrange(institution, desc(count))

# Table of publishers - data: one row per publisher, one column per
# institution (0 where a pair is absent), with a row total used for sorting
by_publisher_data_dc_collapse_table <- by_publisher_data_dc_collapse %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))
Look at publishers based on rank of number of DOIs
# Bar chart of collapsed dataset DOI totals per publisher against publisher
# rank (rank 1 = most DOIs); the x scale is limited to 0-25.
by_publisher_data_dc_collapse_table %>%
  group_by(publisher) %>%
  summarize(count = sum(Total)) %>%
  arrange(desc(count)) %>%
  # Rows are already sorted by descending count, so the row position IS the
  # rank. order() returns a sorting permutation rather than ranks and only
  # matched the rank here because of the preceding arrange().
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 25)) +
  labs(x = "Publisher Rank", y = "Number of DOIs", title = "Number of DOIs by top Publishers") +
  theme_bw()
Look at the top 7 publishers - how many does this capture?
# The seven publishers with the largest collapsed totals (table is sorted by
# Total); this overwrites the earlier top7pubs
top7pubs <- by_publisher_data_dc_collapse_table$publisher[1:7]

# Compare DOI volume and publisher count inside vs. outside the top seven
by_publisher_data_dc_collapse_table %>%
  group_by(publisher) %>%
  summarise(count = sum(Total)) %>%
  mutate(intop7pub = publisher %in% top7pubs) %>%
  group_by(intop7pub) %>%
  summarise(
    totalDOIs = sum(count),
    nrepos = n()
  ) %>%
  ungroup() %>%
  # Share of all DOIs held by each of the two groups
  mutate(propDOIs = totalDOIs / sum(totalDOIs))
## # A tibble: 2 × 4
## intop7pub totalDOIs nrepos propDOIs
## <lgl> <int> <int> <dbl>
## 1 FALSE 1441 153 0.105
## 2 TRUE 12228 7 0.895
# Fixed fill colour for each of the top seven publishers after collapsing by
# container; this overwrites the earlier top7colors
top7colors <- c("Harvard Dataverse" = "dodgerblue2",
                "Zenodo" = "darkorange1",
                "ICPSR" = "darkcyan",
                "Dryad" = "lightgray",
                "figshare" = "purple",
                "Institutional Repository" = "lightblue",
                "Taylor & Francis" = "gold2")

# Dodged bar chart of collapsed dataset DOI counts per institution and
# publisher. The outer parentheses print the plot as well as assigning it.
(by_publisher_data_plot_collapse <- by_publisher_data_dc_collapse %>%
  filter(publisher %in% top7pubs) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top7colors, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  # The y-axis is intentionally left unclipped in this figure:
  #scale_y_continuous(breaks = seq(from = 0, to=5000, by=500)) +
  #coord_cartesian(ylim = c(0,5000)) +
  # With no y-limit applied no bar is cut off, so the "bar cutoff for scaling"
  # caption carried over from the clipped figures was stale and is omitted.
  labs(x = "Institution", y = "Count of Data DOIs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5))

# Write the figure to disk as an 8 x 6 inch PNG
ggsave(by_publisher_data_plot_collapse, filename = "figures/Counts of DataCite Data DOIs by Institution_DOIcollapsed.png", device = "png", width = 8, height = 6, units = "in")
Write out CSV files for each institution:
# Compute the YYYYMMDD stamp once so every file from this run carries the same
# date, even if the loop happens to run across midnight.
date_stamp <- gsub("-", "", Sys.Date())

# For each institution write two CSVs: every cleaned DOI record, and the
# records remaining after collapsing by container.
for (i in unique(all_dois2$institution)) {
  # Use the cleaned all_dois2 (the table the loop iterates over and the one
  # all_dois_collapsed is derived from) rather than the uncleaned all_dois,
  # so both exports share the same cleaning and publisher labels.
  all_dois2 %>%
    filter(institution == i) %>%
    write.csv(file = paste0("data_all_dois/All_dois_", i, date_stamp, ".csv"), row.names = FALSE)

  all_dois_collapsed %>%
    filter(institution == i) %>%
    write.csv(file = paste0("data_all_dois/All_dois_collapsed_", i, date_stamp, ".csv"), row.names = FALSE)
}